1 /* -*- c++ -*- */
2 /*
3  * Copyright 2016 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING.  If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
23 /*!
24  * \page volk_32fc_x2_divide_32fc
25  *
26  * \b Overview
27  *
28  * Divide first vector of complexes element-wise by second.
29  *
30  * <b>Dispatcher Prototype</b>
31  * \code
32  * void volk_32fc_x2_divide_32fc(lv_32fc_t* cVector, const lv_32fc_t* numeratorVector, const lv_32fc_t* denumeratorVector, unsigned int num_points);
33  * \endcode
34  *
35  * \b Inputs
36  * \li numeratorVector: The numerator complex values.
37  * \li numeratorVector: The denumerator complex values.
38  * \li num_points: The number of data points.
39  *
40  * \b Outputs
41  * \li outputVector: The output vector complex floats.
42  *
43  * \b Example
44  * divide a complex vector by itself, demonstrating the result should be pretty close to 1+0j.
45  *
46  * \code
47  *   int N = 10;
48  *   unsigned int alignment = volk_get_alignment();
49  *   lv_32fc_t* input_vector  = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
50  *   lv_32fc_t* out = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
51  *
52  *   float delta = 2.f*M_PI / (float)N;
53  *   for(unsigned int ii = 0; ii < N; ++ii){
54  *       float real_1 = std::cos(0.3f * (float)ii);
55  *       float imag_1 = std::sin(0.3f * (float)ii);
56  *       input_vector[ii] = lv_cmake(real_1, imag_1);
57  *   }
58  *
59  *   volk_32fc_x2_divide_32fc(out, input_vector, input_vector, N);
60  *
61  *   for(unsigned int ii = 0; ii < N; ++ii){
62  *       printf("%1.4f%+1.4fj,", lv_creal(out[ii]), lv_cimag(out[ii]));
63  *   }
64  *   printf("\n");
65  *
66  *   volk_free(input_vector);
67  *   volk_free(out);
68  * \endcode
69  */
70 
71 #ifndef INCLUDED_volk_32fc_x2_divide_32fc_u_H
72 #define INCLUDED_volk_32fc_x2_divide_32fc_u_H
73 
74 #include <inttypes.h>
75 #include <volk/volk_complex.h>
76 #include <float.h>
77 
78 #ifdef LV_HAVE_SSE3
79 #include <pmmintrin.h>
80 #include <volk/volk_sse3_intrinsics.h>
81 
82 static inline void
volk_32fc_x2_divide_32fc_u_sse3(lv_32fc_t * cVector,const lv_32fc_t * numeratorVector,const lv_32fc_t * denumeratorVector,unsigned int num_points)83 volk_32fc_x2_divide_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* numeratorVector,
84                                             const lv_32fc_t* denumeratorVector, unsigned int num_points)
85 {
86     /*
87      * we'll do the "classical"
88      *  a      a b*
89      * --- = -------
90      *  b     |b|^2
91      * */
92   unsigned int number = 0;
93   const unsigned int quarterPoints = num_points / 4;
94 
95   __m128 num01, num23, den01, den23, norm, result;
96   lv_32fc_t* c = cVector;
97   const lv_32fc_t* a = numeratorVector;
98   const lv_32fc_t* b = denumeratorVector;
99 
100   for(; number < quarterPoints; number++){
101     num01 = _mm_loadu_ps((float*) a);    // first pair
102     den01 = _mm_loadu_ps((float*) b);    // first pair
103     num01 = _mm_complexconjugatemul_ps(num01, den01);   // a conj(b)
104     a += 2;
105     b += 2;
106 
107     num23 = _mm_loadu_ps((float*) a);    // second pair
108     den23 = _mm_loadu_ps((float*) b);    // second pair
109     num23 = _mm_complexconjugatemul_ps(num23, den23);   // a conj(b)
110     a += 2;
111     b += 2;
112 
113     norm = _mm_magnitudesquared_ps_sse3(den01, den23);
114     den01 = _mm_unpacklo_ps(norm,norm);
115     den23 = _mm_unpackhi_ps(norm,norm);
116 
117     result = _mm_div_ps(num01, den01);
118     _mm_storeu_ps((float*) c, result); // Store the results back into the C container
119     c += 2;
120     result = _mm_div_ps(num23, den23);
121     _mm_storeu_ps((float*) c, result); // Store the results back into the C container
122     c += 2;
123   }
124 
125   number *= 4;
126   for(;number < num_points; number++){
127     *c = (*a) / (*b);
128     a++; b++; c++;
129   }
130 }
131 #endif /* LV_HAVE_SSE3 */
132 
133 
134 #ifdef LV_HAVE_AVX
135 #include <immintrin.h>
136 #include <volk/volk_avx_intrinsics.h>
137 
138 static inline void
volk_32fc_x2_divide_32fc_u_avx(lv_32fc_t * cVector,const lv_32fc_t * numeratorVector,const lv_32fc_t * denumeratorVector,unsigned int num_points)139 volk_32fc_x2_divide_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* numeratorVector,
140                                             const lv_32fc_t* denumeratorVector, unsigned int num_points)
141 {
142     /*
143      * we'll do the "classical"
144      *  a      a b*
145      * --- = -------
146      *  b     |b|^2
147      * */
148     unsigned int number = 0;
149     const unsigned int quarterPoints = num_points / 4;
150 
151     __m256 num, denum, mul_conj, sq, mag_sq, mag_sq_un, div;
152     lv_32fc_t* c = cVector;
153     const lv_32fc_t* a = numeratorVector;
154     const lv_32fc_t* b = denumeratorVector;
155 
156     for(; number < quarterPoints; number++){
157         num = _mm256_loadu_ps((float*) a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
158         denum = _mm256_loadu_ps((float*) b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
159         mul_conj = _mm256_complexconjugatemul_ps(num, denum);
160         sq = _mm256_mul_ps(denum, denum); // Square the values
161         mag_sq_un = _mm256_hadd_ps(sq,sq); // obtain the actual squared magnitude, although out of order
162         mag_sq = _mm256_permute_ps(mag_sq_un, 0xd8); // I order them
163         // best guide I found on using these functions: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=2738,2059,2738,2738,3875,3874,3875,2738,3870
164         div = _mm256_div_ps(mul_conj,mag_sq);
165 
166         _mm256_storeu_ps((float*) c, div); // Store the results back into the C container
167 
168         a += 4;
169         b += 4;
170         c += 4;
171     }
172 
173     number = quarterPoints * 4;
174 
175     for(; number < num_points; number++){
176         *c++ = (*a++) / (*b++);
177     }
178 
179 }
180 #endif /* LV_HAVE_AVX */
181 
182 
183 #ifdef LV_HAVE_GENERIC
184 
185 static inline void
volk_32fc_x2_divide_32fc_generic(lv_32fc_t * cVector,const lv_32fc_t * aVector,const lv_32fc_t * bVector,unsigned int num_points)186 volk_32fc_x2_divide_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector,
187                                              const lv_32fc_t* bVector, unsigned int num_points)
188 {
189   lv_32fc_t* cPtr = cVector;
190   const lv_32fc_t* aPtr = aVector;
191   const lv_32fc_t* bPtr=  bVector;
192   unsigned int number = 0;
193 
194   for(number = 0; number < num_points; number++){
195     *cPtr++ = (*aPtr++) / (*bPtr++);
196   }
197 }
198 #endif /* LV_HAVE_GENERIC */
199 
200 
201 
202 #endif /* INCLUDED_volk_32fc_x2_divide_32fc_u_H */
203 
204 
205 #ifndef INCLUDED_volk_32fc_x2_divide_32fc_a_H
206 #define INCLUDED_volk_32fc_x2_divide_32fc_a_H
207 
208 #include <inttypes.h>
209 #include <stdio.h>
210 #include <volk/volk_complex.h>
211 #include <float.h>
212 
213 #ifdef LV_HAVE_SSE3
214 #include <pmmintrin.h>
215 #include <volk/volk_sse3_intrinsics.h>
216 
217 static inline void
volk_32fc_x2_divide_32fc_a_sse3(lv_32fc_t * cVector,const lv_32fc_t * numeratorVector,const lv_32fc_t * denumeratorVector,unsigned int num_points)218 volk_32fc_x2_divide_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* numeratorVector,
219                                             const lv_32fc_t* denumeratorVector, unsigned int num_points)
220 {
221     /*
222      * we'll do the "classical"
223      *  a      a b*
224      * --- = -------
225      *  b     |b|^2
226      * */
227   unsigned int number = 0;
228   const unsigned int quarterPoints = num_points / 4;
229 
230   __m128 num01, num23, den01, den23, norm, result;
231   lv_32fc_t* c = cVector;
232   const lv_32fc_t* a = numeratorVector;
233   const lv_32fc_t* b = denumeratorVector;
234 
235   for(; number < quarterPoints; number++){
236     num01 = _mm_load_ps((float*) a);    // first pair
237     den01 = _mm_load_ps((float*) b);    // first pair
238     num01 = _mm_complexconjugatemul_ps(num01, den01);   // a conj(b)
239     a += 2;
240     b += 2;
241 
242     num23 = _mm_load_ps((float*) a);    // second pair
243     den23 = _mm_load_ps((float*) b);    // second pair
244     num23 = _mm_complexconjugatemul_ps(num23, den23);   // a conj(b)
245     a += 2;
246     b += 2;
247 
248     norm = _mm_magnitudesquared_ps_sse3(den01, den23);
249 
250     den01 = _mm_unpacklo_ps(norm,norm); // select the lower floats twice
251     den23 = _mm_unpackhi_ps(norm,norm); // select the upper floats twice
252 
253     result = _mm_div_ps(num01, den01);
254     _mm_store_ps((float*) c, result); // Store the results back into the C container
255     c += 2;
256     result = _mm_div_ps(num23, den23);
257     _mm_store_ps((float*) c, result); // Store the results back into the C container
258     c += 2;
259   }
260 
261   number *= 4;
262   for(;number < num_points; number++){
263     *c = (*a) / (*b);
264     a++; b++; c++;
265   }
266 }
267 #endif /* LV_HAVE_SSE */
268 
269 #ifdef LV_HAVE_AVX
270 #include <immintrin.h>
271 #include <volk/volk_avx_intrinsics.h>
272 
273 static inline void
volk_32fc_x2_divide_32fc_a_avx(lv_32fc_t * cVector,const lv_32fc_t * numeratorVector,const lv_32fc_t * denumeratorVector,unsigned int num_points)274 volk_32fc_x2_divide_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* numeratorVector,
275                                             const lv_32fc_t* denumeratorVector, unsigned int num_points)
276 {
277     /*
278      * we'll do the "classical"
279      *  a      a b*
280      * --- = -------
281      *  b     |b|^2
282      * */
283     unsigned int number = 0;
284     const unsigned int quarterPoints = num_points / 4;
285 
286     __m256 num, denum, mul_conj, sq, mag_sq, mag_sq_un, div;
287     lv_32fc_t* c = cVector;
288     const lv_32fc_t* a = numeratorVector;
289     const lv_32fc_t* b = denumeratorVector;
290 
291     for(; number < quarterPoints; number++){
292         num = _mm256_load_ps((float*) a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
293         denum = _mm256_load_ps((float*) b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
294         mul_conj = _mm256_complexconjugatemul_ps(num, denum);
295         sq = _mm256_mul_ps(denum, denum); // Square the values
296         mag_sq_un = _mm256_hadd_ps(sq,sq); // obtain the actual squared magnitude, although out of order
297         mag_sq = _mm256_permute_ps(mag_sq_un, 0xd8); // I order them
298         // best guide I found on using these functions: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=2738,2059,2738,2738,3875,3874,3875,2738,3870
299         div = _mm256_div_ps(mul_conj,mag_sq);
300 
301         _mm256_store_ps((float*) c, div); // Store the results back into the C container
302 
303         a += 4;
304         b += 4;
305         c += 4;
306     }
307 
308     number = quarterPoints * 4;
309 
310     for(; number < num_points; number++){
311         *c++ = (*a++) / (*b++);
312     }
313 
314 
315 }
316 #endif /* LV_HAVE_AVX */
317 
318 #ifdef LV_HAVE_NEON
319 #include <arm_neon.h>
320 
321 static inline void
volk_32fc_x2_divide_32fc_neon(lv_32fc_t * cVector,const lv_32fc_t * aVector,const lv_32fc_t * bVector,unsigned int num_points)322 volk_32fc_x2_divide_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector,
323 			      const lv_32fc_t* bVector, unsigned int num_points)
324 {
325   lv_32fc_t* cPtr = cVector;
326   const lv_32fc_t* aPtr = aVector;
327   const lv_32fc_t* bPtr = bVector;
328 
329   float32x4x2_t aVal, bVal, cVal;
330   float32x4_t bAbs, bAbsInv;
331 
332   const unsigned int quarterPoints = num_points / 4;
333   unsigned int number = 0;
334   for(; number < quarterPoints; number++){
335     aVal = vld2q_f32((const float*)(aPtr));
336     bVal = vld2q_f32((const float*)(bPtr));
337     aPtr += 4;
338     bPtr += 4;
339     __VOLK_PREFETCH(aPtr+4);
340     __VOLK_PREFETCH(bPtr+4);
341 
342     bAbs = vmulq_f32(      bVal.val[0], bVal.val[0]);
343     bAbs = vmlaq_f32(bAbs, bVal.val[1], bVal.val[1]);
344 
345     bAbsInv = vrecpeq_f32(bAbs);
346     bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs));
347     bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs));
348 
349     cVal.val[0] = vmulq_f32(             aVal.val[0], bVal.val[0]);
350     cVal.val[0] = vmlaq_f32(cVal.val[0], aVal.val[1], bVal.val[1]);
351     cVal.val[0] = vmulq_f32(cVal.val[0], bAbsInv);
352 
353     cVal.val[1] = vmulq_f32(             aVal.val[1], bVal.val[0]);
354     cVal.val[1] = vmlsq_f32(cVal.val[1], aVal.val[0], bVal.val[1]);
355     cVal.val[1] = vmulq_f32(cVal.val[1], bAbsInv);
356 
357     vst2q_f32((float*)(cPtr), cVal);
358     cPtr += 4;
359   }
360 
361   for(number = quarterPoints * 4; number < num_points; number++){
362     *cPtr++ = (*aPtr++) / (*bPtr++);
363   }
364 }
365 #endif /* LV_HAVE_NEON */
366 
367 
368 #ifdef LV_HAVE_GENERIC
369 
370 static inline void
volk_32fc_x2_divide_32fc_a_generic(lv_32fc_t * cVector,const lv_32fc_t * aVector,const lv_32fc_t * bVector,unsigned int num_points)371 volk_32fc_x2_divide_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector,
372                                                const lv_32fc_t* bVector, unsigned int num_points)
373 {
374   lv_32fc_t* cPtr = cVector;
375   const lv_32fc_t* aPtr = aVector;
376   const lv_32fc_t* bPtr=  bVector;
377   unsigned int number = 0;
378 
379   for(number = 0; number < num_points; number++){
380     *cPtr++ = (*aPtr++)  / (*bPtr++);
381   }
382 }
383 #endif /* LV_HAVE_GENERIC */
384 
385 
386 #endif /* INCLUDED_volk_32fc_x2_divide_32fc_a_H */
387